---
title: "Prediting future population"
output: 
  html_document:
    toc: true
    theme: united
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```

# Load data
```{r, message=FALSE}
library(readstata13) # package to read Stata 13 dta files
library(tidyverse)
library(lubridate) # Working with dates
library(parallel)
library(multidplyr)
cluster <- new_cluster(7)
```

```{r}
Municipality_data <- read.dta13("../Data/Municipality_data.dta") %>%
  as_tibble()
```

# Completing population data

## Does population change a lot over time ?

```{r}

population_over_time <- Municipality_data %>%
  select(Code, starts_with("Population")) %>%
  unique() %>%
  gather(year, Population, -Code) %>%
  mutate(year = gsub("Populationen", "", year)) %>%
  mutate(year = as.numeric(year), Population = as.numeric(Population)) %>%
  filter(!is.na(Population))
```

## Glimpse of population over time

### Normal scale

```{r}
set.seed(0)
population_over_time %>%
  filter(Code %in% sample(x = unique(Code), size = 16)) %>%
  ggplot(aes(x = year, y = Population)) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
```

### Log scale


```{r}
set.seed(0)
population_over_time %>%
  filter(is.finite(log(Population))) %>%
  filter(Code %in% sample(x = unique(Code), size = 16)) %>%
  ggplot(aes(x = year, y = log(Population))) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x)
```

## Fitting linear models to all municipalities

```{r}
set.seed(0)

loglinear_fits <- population_over_time %>%
  filter(is.finite(log(Population))) %>%
  group_by(Code) %>%
  do(fits = lm(log(Population) ~ year, data = .) %>% broom::glance()) %>%
  unnest(cols = fits)
```

```{r}
set.seed(0)

linear_fits <- population_over_time %>%
  group_by(Code) %>%
  do(fits = lm(Population ~ year, data = .) %>% broom::glance()) %>%
  unnest(cols = fits)
```

## Looking at the worst fitting models in case a linear model is good enough

```{r}
ill_fitting_municipalities <- linear_fits %>%
  arrange(r.squared) %>%
  head(n=16) %>%
  .$Code

population_over_time %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population)) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", formula = y ~ x)
```

```{r}
ill_fitting_municipalities <- loglinear_fits %>%
  arrange(r.squared) %>%
  head(n=16) %>%
  .$Code

population_over_time %>%
  filter(is.finite(log(Population))) %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = log(Population))) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", formula = y ~ x)
```

```{r}
increase_factors <- population_over_time %>%
  filter(is.finite(log(Population))) %>%
  group_by(Code) %>%
  do(increase = lm(log(Population) ~ year, data = .)$coefficients[["year"]]) %>%
  mutate(increase = increase[[1]]) 

increase_factors$increase %>%
  hist()
```

## Fitting constant models to all municipalities

```{r}
set.seed(0)

logconstant_fits <- population_over_time %>%
  filter(is.finite(log(Population))) %>%
  group_by(Code) %>%
  do(fits = lm(log(Population) ~ 1, data = .) %>% broom::glance()) %>%
  unnest(cols = fits)
```

```{r}
set.seed(0)

constant_fits <- population_over_time %>%
  group_by(Code) %>%
  do(fits = lm(Population ~ 1, data = .) %>% broom::glance()) %>%
  unnest(cols = fits)
```

## Looking at the worst fitting models in case a linear model is good enough

```{r}
ill_fitting_municipalities <- constant_fits %>%
  arrange(r.squared) %>%
  head(n = 16) %>%
  .$Code

population_over_time %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population)) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", formula = y ~ 1)
```

```{r}
ill_fitting_municipalities <- logconstant_fits %>%
  arrange(r.squared) %>%
  head(n=16) %>%
  .$Code

population_over_time %>%
  filter(is.finite(log(Population))) %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = log(Population))) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "lm", formula = y ~ 1)
```

# Selecting best model among constant or linear

## Normal scale

```{r}
selected_models <- linear_fits %>%
  select(Code, AIC, BIC) %>%
  rename(AIC_linear = AIC, BIC_linear = BIC) %>%
  left_join(constant_fits %>% select(Code, AIC, BIC)) %>%
  rename(AIC_constant = AIC, BIC_constant = BIC) %>%
  mutate(Delta_AIC = AIC_linear - AIC_constant, Delta_BIC = BIC_linear - BIC_constant) %>%
  mutate(
    preferred_model_AIC = ifelse(test = Delta_AIC <= -3, yes = "linear", no = "constant"),
    preferred_model_BIC = ifelse(test = Delta_BIC <= -3, yes = "linear", no = "constant")
  )

selected_models %>%
  summarise(
    proportion_linear_AIC = length(which(preferred_model_AIC == "linear")) / length(preferred_model_AIC),
    proportion_linear_BIC = length(which(preferred_model_BIC == "linear")) / length(preferred_model_BIC)
  )
```
## Log scale

```{r}
selected_models_log <- loglinear_fits %>%
  select(Code, AIC, BIC) %>%
  rename(AIC_linear = AIC, BIC_linear = BIC) %>%
  left_join(logconstant_fits %>% select(Code, AIC, BIC)) %>%
  rename(AIC_constant = AIC, BIC_constant = BIC) %>%
  mutate(Delta_AIC = AIC_linear - AIC_constant, Delta_BIC = BIC_linear - BIC_constant) %>%
  mutate(
    preferred_model_AIC = ifelse(test = Delta_AIC <= -3, yes = "linear", no = "constant"),
    preferred_model_BIC = ifelse(test = Delta_BIC <= -3, yes = "linear", no = "constant")
  ) # Selected more complex model only when there is a substantial difference.

selected_models_log %>%
  summarise(
    proportion_linear_AIC = length(which(preferred_model_AIC == "linear")) / length(preferred_model_AIC),
    proportion_linear_BIC = length(which(preferred_model_BIC == "linear")) / length(preferred_model_BIC)
  )
```


## Log and natural scale together

```{r}
select_with_threshold <- function(xIC_constant, xIC_linear, xIC_logconstant, xIC_loglinear) {
  complexity_ordered_xIC <- c(xIC_constant, xIC_logconstant, xIC_linear, xIC_loglinear)
  complexity_ordered_names <- c("constant", "logconstant", "linear", "loglinear")
  id_smallest <- which.min(complexity_ordered_xIC)
  while (id_smallest > 1) {
    if (complexity_ordered_xIC[id_smallest] - complexity_ordered_xIC[id_smallest - 1] <= -3) {
      return(complexity_ordered_names[id_smallest])
    } else {
      id_smallest <- id_smallest - 1
    }
  }
  return("constant") # Branch taken only if id_smallest = 1
}


selected_models_all <- list(
  linear_fits %>%
    select(Code, AIC, BIC) %>%
    rename(AIC_linear = AIC, BIC_linear = BIC),
  constant_fits %>%
    select(Code, AIC, BIC) %>%
    rename(AIC_constant = AIC, BIC_constant = BIC),
  loglinear_fits %>%
    select(Code, AIC, BIC) %>%
    rename(AIC_loglinear = AIC, BIC_loglinear = BIC),
  logconstant_fits %>%
    select(Code, AIC, BIC) %>%
    rename(AIC_logconstant = AIC, BIC_logconstant = BIC)
) %>%
  Reduce(left_join, .) %>%
  mutate(
    preferred_model_AIC = mapply(select_with_threshold, AIC_constant, AIC_linear, AIC_logconstant, AIC_loglinear),
    preferred_model_BIC = mapply(select_with_threshold, BIC_constant, BIC_linear, BIC_logconstant, BIC_loglinear)
  )


selected_models_all_dict <- selected_models_all$preferred_model_BIC %>% setNames(selected_models_all$Code)

selected_models_all %>%
  select(starts_with("preferred")) %>%
  lapply(table)
```

Preferred model is overwhelmingly the loglinear model, sign that we should include the log version of the model.

# Prediction according to preferred model

```{r}

fitmodel_and_predict <- function(df, modelname, missing_years) {
  res <- tibble(year = missing_years)
  if (modelname == "loglinear") {
    ft <- lm(formula = log(Population) ~ year, data = df)
    return(res %>% 
             mutate(Population = round(exp(predict(ft, newdata = res))),
                    adj_rsquared = summary(ft)$adj.r.squared)
           )
  }
  else if (modelname == "logconstant") {
    ft <- lm(formula = log(Population) ~ 1, data = df)
    return(res %>% 
             mutate(Population = round(exp(predict(ft, newdata = res))),
                    adj_rsquared = summary(ft)$adj.r.squared)
           )
  }
  else if (modelname == "constant") {
    ft <- lm(formula = Population ~ 1, data = df)
    return(res %>% 
             mutate(Population = round(predict(ft, newdata = res)),
                    adj_rsquared = summary(ft)$adj.r.squared)
           )
  }
  else if (modelname == "linear") {
    ft <- lm(formula = Population ~ year, data = df)
    return(res %>% 
             mutate(Population = round(predict(ft, newdata = res)),
                    adj_rsquared = summary(ft)$adj.r.squared)
           )
  }
}

for_one_municipality <- function(Municipality_code) {
  Municipality_pop <- population_over_time %>%
    filter(Code == Municipality_code)
  missing_years <- setdiff(2010:2021, Municipality_pop$year)

  fitmodel_and_predict(df = Municipality_pop, modelname = selected_models_all_dict[Municipality_code], missing_years = missing_years) %>%
    mutate(Code = Municipality_code, predicted = "yes")
}

population_over_time_completed = population_over_time$Code %>%
  unique() %>%
  parallel::mclapply(FUN = for_one_municipality, mc.preschedule = T, mc.cores = 7) %>%
  bind_rows() %>% 
  bind_rows(population_over_time %>% mutate(predicted = "no"))

population_over_time_completed
```
```{r, cache=FALSE}
saveRDS(file = "../Results/Population_data_extrapolated.rds", object = population_over_time_completed)
readr::write_csv(population_over_time_completed, file = "../Results/Population_data_extrapolated.csv", col_names = T)
```


```{r}
set.seed(0)
random_sample_municipalities <- population_over_time_completed$Code %>%
  sample(size = 25, replace = F)

population_over_time_completed %>%
  filter(Code %in% random_sample_municipalities) %>%
  ggplot(aes(x = year, y = Population, colour = predicted)) +
  theme_classic() +
  facet_wrap(~Code, scales = "free_y") +
  geom_point() +
  geom_line() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
```


```{r}
ill_fitting_municipalities <- population_over_time_completed %>%
  select(Code, adj_rsquared) %>% 
  unique() %>% 
  arrange(adj_rsquared) %>%
  head(n=25) %>%
  .$Code
  
population_over_time_completed %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population, colour = predicted)) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

population_over_time_completed %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population, colour = predicted)) +
  theme_classic() +
  facet_wrap(~Code, scales = "free_y") +
  geom_point() +
  geom_line() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
```

## Filtering out those for which the adjusted R-square is equal to 0

```{r}
ill_fitting_municipalities <- population_over_time_completed %>%
  select(Code, adj_rsquared) %>% 
  unique() %>% 
  filter(adj_rsquared>0.001) %>%
  arrange(adj_rsquared) %>%
  head(n=25) %>%
  .$Code
  
population_over_time_completed %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population, colour = predicted)) +
  theme_classic() +
  facet_wrap(~Code) +
  geom_point() +
  geom_line() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

population_over_time_completed %>%
  filter(Code %in% ill_fitting_municipalities) %>%
  ggplot(aes(x = year, y = Population, colour = predicted)) +
  theme_classic() +
  facet_wrap(~Code, scales = "free_y") +
  geom_point() +
  geom_line() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
```
